%matplotlib inline
import sys
sys.path.append('/home/ubuntu/tools/python-genomics')
import Scanpyplus
from importlib import reload
import pandas as pd
import scanpy.api as sc
import anndata
import os
import numpy as np
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=80,dpi_save=300,color_map='PuRd')
sc.logging.print_version_and_date()
import matplotlib
import matplotlib.pyplot as plt
plt.show()
matplotlib.rcParams.update({'figure.figsize': (8,8)})
import pandasPlus
#Scanpy tutorial is here https://icb-scanpy-tutorials.readthedocs-hosted.com/en/latest/pbmc3k.html
adata = sc.read_10x_mtx(
'./filtered_gene_bc_matrices/hg19/',# the directory with the `.mtx` file
var_names='gene_symbols', # use gene symbols for the variable names
cache=True)
adata.var_names_make_unique()
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
mito_genes = adata.var_names.str.startswith('MT-')
adata.obs['percent_mito'] = np.sum(
adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
adata.obs['n_counts'] = adata.X.sum(axis=1).A1
adata = adata[adata.obs.n_genes < 2500, :]
adata = adata[adata.obs.percent_mito < 0.05, :]
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.3)
bdata = adata[:, adata.var.highly_variable]
sc.pp.regress_out(bdata, ['n_counts', 'percent_mito'])
sc.pp.scale(bdata, max_value=10)
sc.tl.pca(bdata, svd_solver='arpack')
sc.pp.neighbors(bdata, n_neighbors=10, n_pcs=40)
sc.tl.umap(bdata)
sc.tl.leiden(bdata)
sc.pl.umap(bdata, color='leiden', legend_loc='on data')
bdata
[bdata, output1, output2, output3] = Scanpyplus.DeepTree(bdata,
MouseC1ColorDict2={False:'#000000',True:'#00FFFF'}, #This specifies rowlabel colors
cell_type='leiden', #This specieis the .obs variable you are interested in
row_cluster=True,col_cluster=True #This lets the function clusters the rows and columns
)
This generates three clustermaps stored in output1, output2 and output3
The first clustermap is an overview of the top 4000 highly-variable genes. As you can see only a small subset of them show coherent patterns. Color bars on the top shows cell cluster IDs from the previous straight forward clustering.
The second clustermap highlights the DeepTree-selected genes using cyan on the left. The selection gets rid of most sporadic noises.
The third clustermap shows how cell clustering will improve if we only use the DeepTree-selected genes
bdata.var['Deep']
#Now use these genes to replace highly-variable genes for data analysis.
bdata=bdata[:,bdata.var['Deep']]
sc.tl.pca(bdata, svd_solver='arpack')
sc.pp.neighbors(bdata, n_neighbors=10, n_pcs=40)
sc.tl.umap(bdata)
sc.tl.leiden(bdata)
sc.pl.umap(bdata, color='leiden', legend_loc='on data')
As we can see more clusters emerge. But are they interesting? Let's check
#transfer the coordinates and clustering info back to the full dataset
adata.obsm['X_umap']=bdata.obsm['X_umap']
adata.obs['leiden']=bdata.obs['leiden']
#Plot the known marker genes
sc.pl.umap(adata,color=['IL7R','CD14','LYZ','MS4A1','CD8A','GNLY', 'NKG7','FCGR3A', 'MS4A7','FCER1A', 'CST3','PPBP'],
color_map='jet')
Indeed all the major cell types are still captured. What about the new types and subtypes?
sc.pl.umap(adata,color=['RUFY1','TPM1','PGRMC1','GNG7','ACAP1'],color_map='jet')
It will be interesting to follow the subtle differences between these subclusters to see whether they do have biological meanings